# Importer les librairies
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import glob
from sklearn.metrics import confusion_matrix
import IPython.display as ipd # pour ecouter le son
import os
import sys
import warnings
# Silence deprecation warnings so library churn does not clutter the output.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    warnings.filterwarnings("ignore", category=DeprecationWarning)

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Root directories of the four emotional-speech corpora used below.
TESS = "/home/vm/kaggle/input/TESS/tess toronto emotional speech set data/TESS Toronto emotional speech set data/"
RAV = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/"
SAVEE = "/home/vm/kaggle/input/SAVEE/ALL/"
CREMA = "/home/vm/kaggle/input/CREMA-D/AudioWAV/"
# Sanity-check one corpus: list a few SAVEE files.
dir_list = os.listdir(SAVEE)
dir_list[0:5]

# Parse each file name to recover the emotion label.
# SAVEE encodes the emotion in a 1-2 character code before the take number
# (e.g. 'DC_f11.wav' -> 'f', 'DC_sa01.wav' -> 'sa'); all SAVEE speakers are male.
_SAVEE_CODES = {
    '_a': 'male_angry',
    '_d': 'male_disgust',
    '_f': 'male_fear',
    '_h': 'male_happy',
    '_n': 'male_neutral',
    'sa': 'male_sad',
    'su': 'male_surprise',
}
emotion = []
path = []
for wav_name in dir_list:
    # The slice [-8:-6] lands on the emotion code for both 1- and 2-letter codes.
    emotion.append(_SAVEE_CODES.get(wav_name[-8:-6], 'male_error'))
    path.append(SAVEE + wav_name)

# Count the recordings per label.
SAVEE_df = pd.DataFrame(emotion, columns=['labels'])
SAVEE_df['source'] = 'SAVEE'
SAVEE_df = pd.concat([SAVEE_df, pd.DataFrame(path, columns=['path'])], axis=1)
SAVEE_df.labels.value_counts()
# Visualise one SAVEE clip with librosa (a fearful male speaker).
fname = SAVEE + 'DC_f11.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Listen to the clip (renders an audio player in a notebook).
ipd.Audio(fname)
# Same again with a clip from a happy speaker.
fname = SAVEE + 'DC_h11.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Play the audio clip.
ipd.Audio(fname)
# RAVDESS: emotion and actor id are encoded in the hyphen-separated file name.
dir_list = sorted(os.listdir(RAV))
emotion = []
gender = []
path = []
for actor_dir in dir_list:
    for wav_name in os.listdir(RAV + actor_dir):
        fields = wav_name.split('.')[0].split('-')
        # Field 2 (0-based) is the emotion code 1..8, decoded below.
        emotion.append(int(fields[2]))
        # Field 6 is the actor id: even ids are female, odd ids are male.
        gender.append('female' if int(fields[6]) % 2 == 0 else 'male')
        path.append(RAV + actor_dir + '/' + wav_name)

RAV_df = pd.DataFrame(emotion)
RAV_df = RAV_df.replace({1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad',
                         5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'})
RAV_df = pd.concat([pd.DataFrame(gender), RAV_df], axis=1)
RAV_df.columns = ['gender', 'emotion']
RAV_df['labels'] = RAV_df.gender + '_' + RAV_df.emotion
RAV_df['source'] = 'RAVDESS'
RAV_df = pd.concat([RAV_df, pd.DataFrame(path, columns=['path'])], axis=1)
RAV_df = RAV_df.drop(['gender', 'emotion'], axis=1)
RAV_df.labels.value_counts()
# Pick a clip from a fearful speaker.
fname = RAV + 'Actor_14/03-01-06-02-02-02-14.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Play the audio clip.
ipd.Audio(fname)
# Pick a clip from a happy speaker.
fname = RAV + 'Actor_14/03-01-03-02-02-02-14.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Play the audio clip.
ipd.Audio(fname)
# TESS: recordings are already grouped into one folder per emotion.
# Folder names mix case between the OAF and YAF speakers, so map them explicitly.
_TESS_LABELS = {
    'OAF_angry': 'female_angry', 'YAF_angry': 'female_angry',
    'OAF_disgust': 'female_disgust', 'YAF_disgust': 'female_disgust',
    'OAF_Fear': 'female_fear', 'YAF_fear': 'female_fear',
    'OAF_happy': 'female_happy', 'YAF_happy': 'female_happy',
    'OAF_neutral': 'female_neutral', 'YAF_neutral': 'female_neutral',
    'OAF_Pleasant_surprise': 'female_surprise', 'YAF_pleasant_surprised': 'female_surprise',
    'OAF_Sad': 'female_sad', 'YAF_sad': 'female_sad',
}
dir_list = sorted(os.listdir(TESS))
dir_list
path = []
emotion = []
for folder in dir_list:
    # The label depends only on the folder, so resolve it once per folder.
    label = _TESS_LABELS.get(folder, 'Unknown')
    for wav_name in os.listdir(TESS + folder):
        emotion.append(label)
        path.append(TESS + folder + "/" + wav_name)

TESS_df = pd.DataFrame(emotion, columns=['labels'])
TESS_df['source'] = 'TESS'
TESS_df = pd.concat([TESS_df, pd.DataFrame(path, columns=['path'])], axis=1)
TESS_df.labels.value_counts()
# Pick a clip from a fearful speaker.
fname = TESS + 'YAF_fear/YAF_dog_fear.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Play the audio clip.
ipd.Audio(fname)
# Pick a clip from a happy speaker.
fname = TESS + 'YAF_happy/YAF_dog_happy.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Play the audio clip.
ipd.Audio(fname)
# CREMA-D: gender comes from the actor id (first filename field),
# emotion from the 3-letter code in the third field.
dir_list = sorted(os.listdir(CREMA))
print(dir_list[0:10])
gender = []
emotion = []
path = []
# Actor ids belonging to female speakers (everyone else is male).
female = [1002,1003,1004,1006,1007,1008,1009,1010,1012,1013,1018,1020,1021,1024,1025,1028,1029,1030,1037,1043,1046,1047,1049,
          1052,1053,1054,1055,1056,1058,1060,1061,1063,1072,1073,1074,1075,1076,1078,1079,1082,1084,1089,1091]
_CREMA_EMOTIONS = {'SAD': 'sad', 'ANG': 'angry', 'DIS': 'disgust',
                   'FEA': 'fear', 'HAP': 'happy', 'NEU': 'neutral'}
for wav_name in dir_list:
    fields = wav_name.split('_')
    spk = 'female' if int(fields[0]) in female else 'male'
    gender.append(spk)
    emo = _CREMA_EMOTIONS.get(fields[2])
    emotion.append(spk + '_' + emo if emo is not None else 'Unknown')
    path.append(CREMA + wav_name)

CREMA_df = pd.DataFrame(emotion, columns=['labels'])
CREMA_df['source'] = 'CREMA'
CREMA_df = pd.concat([CREMA_df, pd.DataFrame(path, columns=['path'])], axis=1)
CREMA_df.labels.value_counts()
# Still using librosa: first inspect a clip from a happy speaker.
fname = CREMA + '1012_IEO_HAP_HI.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Play the audio clip.
ipd.Audio(fname)
# Then a clip from a fearful speaker.
fname = CREMA + '1012_IEO_FEA_HI.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Play the audio clip.
ipd.Audio(fname)
# Combine the four per-corpus tables into one master label/path index.
df = pd.concat([SAVEE_df, RAV_df, TESS_df, CREMA_df], axis = 0)
print(df.labels.value_counts())
df.head()
# Persist the index for the modelling part of the pipeline.
df.to_csv("Data_path.csv",index=False)
# Importer les librairies
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import pandas as pd
import os
import IPython.display as ipd # Jouer du son sur notebook
# Source - RAVDESS; Gender - Female; Emotion - Angry
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_08/03-01-05-02-01-01-08.wav"
# Load 2.5 s starting 0.5 s in, resampled to 44.1 kHz (22050*2).
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# Raw waveform.
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')
# MFCC heatmap over time.
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()
ipd.Audio(path)
# Source - RAVDESS; Gender - Male; Emotion - Angry
# Bug fix: this path used '/home/main/...' while every other clip in the
# script lives under '/home/vm/...', so librosa.load raised FileNotFoundError.
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_09/03-01-05-01-01-01-09.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# Raw waveform.
plt.figure(figsize=(20, 15))
plt.subplot(3, 1, 1)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')
# MFCC heatmap over time.
plt.figure(figsize=(20, 15))
plt.subplot(3, 1, 1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()
ipd.Audio(path)
# Source - RAVDESS; Gender - Female; Emotion - Happy
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_12/03-01-03-01-02-01-12.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# Raw waveform.
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')
# MFCC heatmap over time.
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()
ipd.Audio(path)
# Source - RAVDESS; Gender - Male; Emotion - Happy
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_11/03-01-03-01-02-02-11.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# Raw waveform.
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')
# MFCC heatmap over time.
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()
ipd.Audio(path)
# Source - RAVDESS; Gender - Female; Emotion - Angry
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_08/03-01-05-02-01-01-08.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
# Mean over the 13 MFCC coefficients per frame. (The original computed the full
# MFCC matrix and immediately overwrote it with the mean — the redundant pass
# is dropped.)
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))
# Source - RAVDESS; Gender - Male; Emotion - Angry
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_09/03-01-05-01-01-01-09.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
# Mean over the 13 MFCC coefficients per frame (redundant full-matrix pass dropped).
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))
# Overlay the female and male mean-MFCC curves for the angry clips.
plt.figure(figsize=(20, 15))
plt.subplot(3, 1, 1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
# Source - RAVDESS; Gender - Female; Emotion - Happy
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_12/03-01-03-01-02-01-12.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
# Mean over the 13 MFCC coefficients per frame (redundant full-matrix pass dropped).
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))
# Source - RAVDESS; Gender - Male; Emotion - Happy
path = "/home/vm/kaggle/input/RAVDESS/audio_speech_actors_01-24/Actor_11/03-01-03-01-02-02-11.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
# Mean over the 13 MFCC coefficients per frame (redundant full-matrix pass dropped).
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))
# Overlay the female and male mean-MFCC curves for the happy clips.
plt.figure(figsize=(20, 15))
plt.subplot(3, 1, 1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
# Importer les librairies nécessaires
# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Other
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob
import os
import pickle
import IPython.display as ipd # pour jouer du son sur notebook
# Pick up the meta-data produced by the first part of the kernel.
ref = pd.read_csv("/home/vm/kaggle/input/data-path/Data_path.csv")
ref.head()

# Feature extraction over the whole dataset (takes ~10 min across the 4 corpora).
# Improvements over the original: the unused enumerate index and the manual
# `counter` (which duplicated it) are gone, and rows are accumulated in a list
# instead of growing the DataFrame one `.loc` assignment at a time.
features = []
for wav_path in ref.path:
    X, sample_rate = librosa.load(wav_path,
                                  res_type='kaiser_fast',
                                  duration=2.5,
                                  sr=44100,
                                  offset=0.5)
    sample_rate = np.array(sample_rate)
    # Mean over the 13 MFCC coefficients as a compact per-frame feature vector.
    # Could use min/max etc. as well.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
    features.append([mfccs])
df = pd.DataFrame(features, columns=['feature'])

# Check a few records to make sure processing succeeded.
print(len(df))
df.head()
# Expand the per-frame means into their own feature columns.
df = pd.concat([ref, pd.DataFrame(df['feature'].values.tolist())], axis=1)
df[:5]
# Replace NA (clips shorter than 2.5 s yield fewer frames) with 0.
df = df.fillna(0)
print(df.shape)
df[:5]
# Split between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['path', 'labels', 'source'], axis=1),
    df.labels,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)
# How the data looks before normalisation.
X_train[150:160]
# Normalise using statistics computed on the training split only
# (applying train mean/std to the test split avoids leakage).
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std
# Check the dataset now.
X_train[150:160]
# Convert to numpy arrays for Keras.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
# One-hot encode the target.
# Bug fix: the original called fit_transform on the *test* labels too, refitting
# the encoder on the test set; fit on train, then only transform test so the
# integer coding is guaranteed consistent between the two splits.
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
y_test = np_utils.to_categorical(lb.transform(y_test))
print(X_train.shape)
print(lb.classes_)
# print(y_train[0:10])
# print(y_test[0:10])
# Pickle the fitted encoder for inference later; `with` guarantees the file
# handle is closed even if the dump raises.
with open('labels', 'wb') as outfile:
    pickle.dump(lb, outfile)
# Add the channel dimension expected by Conv1D.
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)
X_train.shape
# New 1-D CNN over the MFCC feature vector; X_train.shape[1] = no. of columns.
_layers = [
    Conv1D(256, 8, padding='same', input_shape=(X_train.shape[1], 1)),
    Activation('relu'),
    Conv1D(256, 8, padding='same'),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.25),
    MaxPooling1D(pool_size=8),
    Conv1D(128, 8, padding='same'),
    Activation('relu'),
    Conv1D(128, 8, padding='same'),
    Activation('relu'),
    Conv1D(128, 8, padding='same'),
    Activation('relu'),
    Conv1D(128, 8, padding='same'),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.25),
    MaxPooling1D(pool_size=8),
    Conv1D(64, 8, padding='same'),
    Activation('relu'),
    Conv1D(64, 8, padding='same'),
    Activation('relu'),
    Flatten(),
    Dense(14),  # one output unit per gender_emotion class
    Activation('softmax'),
]
model = Sequential(_layers)
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model_history = model.fit(X_train, y_train, batch_size=16, epochs=100,
                          validation_data=(X_test, y_test))
# Training curves: loss per epoch on the train vs. validation split.
plt.plot(model_history.history['loss'])
plt.plot(model_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# Persist the trained network: weights + architecture as HDF5, and the
# architecture alone as JSON for later reconstruction.
model_name = 'Emotion_Model.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Save model and weights at %s ' % model_path)
model_json = model.to_json()
with open("model_json.json", "w") as json_file:
    json_file.write(model_json)
# Reload the architecture from JSON. Fix: the original opened the file without
# a context manager, leaking the handle if read() raised.
with open('model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the trained weights into the reconstructed model.
loaded_model.load_weights("saved_models/Emotion_Model.h5")
print("Loaded model from disk")
# Keras optimiser (must recompile after loading from JSON).
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1] * 100))
# Class probabilities -> winning class index per test sample.
preds = loaded_model.predict(X_test, batch_size=16, verbose=1)
preds = preds.argmax(axis=1)
preds
# Decode predicted class indices back to their text labels.
predicted_labels = lb.inverse_transform(preds.astype(int).flatten())
preds = pd.DataFrame({'predictedvalues': predicted_labels})
# Decode the ground-truth one-hot rows the same way.
actual_labels = lb.inverse_transform(y_test.argmax(axis=1).astype(int).flatten())
actual = pd.DataFrame({'actualvalues': actual_labels})
# Combine both into a single frame for inspection and scoring.
finaldf = actual.join(preds)
finaldf[170:180]
# Write the predictions to disk for the analysis cells below.
finaldf.to_csv('Predictions.csv', index=False)
finaldf.groupby('predictedvalues').count()
# the confusion matrix heat map plot
def print_confusion_matrix(confusion_matrix, class_names, figsize=(10, 7), fontsize=14):
    """Plot a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to
        sklearn.metrics.confusion_matrix. Similarly constructed ndarrays can
        also be used.
    class_names: list
        An ordered list of class names, in the order they index the given
        confusion matrix.
    figsize: tuple
        A 2-long tuple: horizontal and vertical size of the output figure.
        Defaults to (10, 7).
    fontsize: int
        Font size for axes labels. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure.
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Bug fix: the docstring promised the figure but the function returned None.
    return fig
# Gender recode function
def gender(row):
    """Collapse a 'gender_emotion' label into just 'female' or 'male'.

    Bug fix: the original tested `row == 'a' or 'b' or ...`, which Python
    evaluates as `(row == 'a') or 'b' or ...` — always truthy — so every
    label (including all male ones) came back as 'female'. It also omitted
    'female_angry' from its female list. Returns None for labels without a
    recognised gender prefix (e.g. 'Unknown', 'male_error' has one and maps
    to 'male').
    """
    if row.startswith('female_'):
        return 'female'
    if row.startswith('male_'):
        return 'male'
    return None
# Reload the saved predictions and score the full 14-class problem.
finaldf = pd.read_csv("Predictions.csv")
classes = finaldf.actualvalues.unique()
classes.sort()
# Confusion matrix over all gender_emotion labels.
c = confusion_matrix(finaldf.actualvalues, finaldf.predictedvalues)
print(accuracy_score(finaldf.actualvalues, finaldf.predictedvalues))
print_confusion_matrix(c, class_names = classes)
# Per-class precision/recall/F1 report.
classes = finaldf.actualvalues.unique()
classes.sort()
print(classification_report(finaldf.actualvalues, finaldf.predictedvalues, target_names=classes))
# Collapse the 14 gender_emotion labels down to gender only and re-score.
modidf = finaldf
_GENDER_MAP = {f'{g}_{e}': g
               for g in ('female', 'male')
               for e in ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral')}
modidf['actualvalues'] = finaldf.actualvalues.replace(_GENDER_MAP)
modidf['predictedvalues'] = finaldf.predictedvalues.replace(_GENDER_MAP)
classes = modidf.actualvalues.unique()
classes.sort()
# Confusion matrix for the 2-class (gender) problem.
c = confusion_matrix(modidf.actualvalues, modidf.predictedvalues)
print(accuracy_score(modidf.actualvalues, modidf.predictedvalues))
print_confusion_matrix(c, class_names=classes)
# Classification report.
classes = modidf.actualvalues.unique()
classes.sort()
print(classification_report(modidf.actualvalues, modidf.predictedvalues, target_names=classes))
# Collapse the 14 labels down to the 7 emotions (gender removed) and re-score.
modidf = pd.read_csv("Predictions.csv")
_EMOTION_MAP = {f'{g}_{e}': e
                for g in ('female', 'male')
                for e in ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral')}
modidf['actualvalues'] = modidf.actualvalues.replace(_EMOTION_MAP)
modidf['predictedvalues'] = modidf.predictedvalues.replace(_EMOTION_MAP)
classes = modidf.actualvalues.unique()
classes.sort()
# Confusion matrix for the 7-class (emotion) problem.
c = confusion_matrix(modidf.actualvalues, modidf.predictedvalues)
print(accuracy_score(modidf.actualvalues, modidf.predictedvalues))
print_confusion_matrix(c, class_names=classes)
# Classification report.
classes = modidf.actualvalues.unique()
classes.sort()
print(classification_report(modidf.actualvalues, modidf.predictedvalues, target_names=classes))
# Importing required libraries
from keras.models import Sequential, Model, model_from_json
import matplotlib.pyplot as plt
import keras
import pickle
import wave # !pip install wave
import os
import pandas as pd
import numpy as np
import sys
import warnings
import librosa
import librosa.display
import IPython.display as ipd # To play sound in the notebook
import pyaudio
import tensorflow as tf
# PyAudio recording parameters.
CHUNK = 1024                 # frames read per buffer
FORMAT = pyaudio.paInt16     # 16-bit signed samples
CHANNELS = 2
RATE = 44100                 # samples per second
RECORD_SECONDS = 4
WAVE_OUTPUT_FILENAME = "test-enregistrement.wav"

# Open the default input device and capture RECORD_SECONDS of audio.
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)  # buffer
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)  # 2 bytes (16 bits) per channel
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()

# Write the captured frames out as a standard WAV file.
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
# Load a sample "happy" recording and inspect it.
# Consistency fix: the original mixed '/home/vm//kaggle' (double slash) with
# '/home/vm/kaggle' — harmless on POSIX but normalised here.
data, sampling_rate = librosa.load('/home/vm/kaggle/input/happy_audio/Liza-happy-v3.wav')
ipd.Audio('/home/vm/kaggle/input/happy_audio/Liza-happy-v3.wav')
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
# Reload the trained architecture from JSON; `with` closes the handle even if
# read() raises (the original leaked it).
with open('/home/vm/kaggle/input/saved-model/model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the trained weights into the reconstructed model.
loaded_model.load_weights("/home/vm/kaggle/input/saved-model/Emotion_Model.h5")
print("Loaded model from disk")
# Recompile with the same optimiser used during training.
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
# Transform the new recording into the model's input format.
wav_file = '/home/vm/kaggle/input/happy_audio/Liza-happy-v3.wav'
X, sample_rate = librosa.load(wav_file,
                              res_type='kaiser_fast',
                              duration=2.5,
                              sr=44100,
                              offset=0.5)
sample_rate = np.array(sample_rate)
# Mean over the 13 MFCC coefficients, matching the training features.
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
newdf = pd.DataFrame(data=mfccs).T
newdf
# Apply predictions (add the channel dimension Conv1D expects).
newdf = np.expand_dims(newdf, axis=2)
newpred = loaded_model.predict(newdf, batch_size=16, verbose=1)
newpred
# Restore the label encoder fitted during training.
filename = '/home/vm/kaggle/input/labels/labels'
with open(filename, 'rb') as infile:
    lb = pickle.load(infile)
# Decode the winning class index back to its text label.
final = lb.inverse_transform(newpred.argmax(axis=1).astype(int).flatten())
print(final)  # emo(final)  # gender(final)
# Importing required libraries
# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Other
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob
import os
from tqdm import tqdm
import pickle
import IPython.display as ipd # To play sound in the notebook
#########################
# Augmentation methods
#########################
def noise(data):
    """Add white noise scaled to at most 5% of the signal's peak amplitude.

    Distribution options: https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.random.html
    (raise the 0.05 factor for more noise).
    """
    peak = np.amax(data)
    noise_amp = 0.05 * np.random.uniform() * peak
    return data.astype('float64') + noise_amp * np.random.normal(size=data.shape[0])
def shift(data):
    """Circularly shift the signal by a random offset in [-5000, 5000) samples."""
    offset = int(np.random.uniform(low=-5, high=5) * 1000)  # default was 500
    return np.roll(data, offset)
def stretch(data, rate=0.8):
    """Time-stretch the signal by `rate` (rate < 1 slows it down / lengthens it).

    Note that this changes the sample count slightly, expanding the dataset.
    """
    # Pass `rate` by keyword: librosa >= 0.10 made it keyword-only, and the
    # keyword form also works on older librosa versions.
    return librosa.effects.time_stretch(data, rate=rate)
def pitch(data, sample_rate):
    """Shift the pitch up by a random amount of up to 4 semitones."""
    bins_per_octave = 12
    pitch_pm = 2
    pitch_change = pitch_pm * 2 * (np.random.uniform())
    # Pass sr/n_steps by keyword: librosa >= 0.10 made them keyword-only,
    # and the keyword form also works on older librosa versions.
    return librosa.effects.pitch_shift(data.astype('float64'),
                                       sr=sample_rate,
                                       n_steps=pitch_change,
                                       bins_per_octave=bins_per_octave)
def dyn_change(data):
    """Scale the whole signal by a random gain drawn from [-0.5, 7)."""
    gain = np.random.uniform(low=-0.5, high=7)  # default was low=1.5, high=3
    return data * gain
def speedNpitch(data):
    """Speed and pitch tuning via simple resampling.

    Bug fix: the original zeroed and overwrote `data` in place (`data *= 0`),
    silently corrupting the caller's array — e.g. the demo cells reuse `data`
    after calling this. The result is now written to a fresh array; return
    values are unchanged. (Also fixed the "peed" docstring typo.)
    """
    # You can change low and high here.
    length_change = np.random.uniform(low=0.8, high=1)
    speed_fac = 1.2 / length_change  # try changing 1.0 to 2.0 ... =D
    tmp = np.interp(np.arange(0, len(data), speed_fac),
                    np.arange(0, len(data)),
                    data)
    minlen = min(data.shape[0], tmp.shape[0])
    out = np.zeros_like(data)
    out[0:minlen] = tmp[0:minlen]
    return out
####################################
# the confusion matrix heat map plot
####################################
def print_confusion_matrix(confusion_matrix, class_names, figsize=(10, 7), fontsize=14):
    """Prints a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the ouputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure
    """
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )
    fig = plt.figure(figsize=figsize)
    try:
        # fmt="d" requires integer cell values
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Bug fix: the docstring promises the figure, but it was never returned.
    return fig
# Use one audio file from the previous parts again
fname = '/home/vm/kaggle/input/SAVEE/ALL/JK_f11.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot was removed in librosa >= 0.10 (replaced by
# librosa.display.waveshow) — confirm the pinned librosa version.
librosa.display.waveplot(data, sr=sampling_rate)
# Play it again to refresh our memory
ipd.Audio(data, rate=sampling_rate)
# 1. White-noise augmentation
x = noise(data)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(x, sr=sampling_rate)
ipd.Audio(x, rate=sampling_rate)
# 2. Random circular shift
x = shift(data)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(x, sr=sampling_rate)
ipd.Audio(x, rate=sampling_rate)
# 3. Time stretch (default rate=0.8 slows the clip down)
x = stretch(data)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(x, sr=sampling_rate)
ipd.Audio(x, rate=sampling_rate)
# 4. Random pitch shift
x = pitch(data, sampling_rate)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(x, sr=sampling_rate)
ipd.Audio(x, rate=sampling_rate)
# 5. Random gain (dynamic-range) change
x = dyn_change(data)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(x, sr=sampling_rate)
ipd.Audio(x, rate=sampling_rate)
# 6. Speed-and-pitch change
x = speedNpitch(data)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(x, sr=sampling_rate)
ipd.Audio(x, rate=sampling_rate)
# lets pick up the meta-data that we got from our first part of the Kernel
ref = pd.read_csv("/home/vm/kaggle/input/data-path/Data_path.csv")
ref.head()
# Note this takes a couple of minutes (~16 mins) as we're iterating over 4 datasets, and with augmentation
df = pd.DataFrame(columns=['feature'])
df_noise = pd.DataFrame(columns=['feature'])
df_speedpitch = pd.DataFrame(columns=['feature'])
cnt = 0
# loop feature extraction over the entire dataset
for i in tqdm(ref.path):
    # first load the audio: 2.5 s starting at a 0.5 s offset, resampled to 44.1 kHz
    X, sample_rate = librosa.load(i
                                  , res_type='kaiser_fast'
                                  ,duration=2.5
                                  ,sr=44100
                                  ,offset=0.5
                                 )
    # take mfcc and mean as the feature. Could do min and max etc as well.
    # NOTE(review): sr=np.array(sample_rate) wraps a scalar into a 0-d array —
    # a plain number would suffice. The mean over axis=0 averages across the
    # 13 MFCC bands, leaving one value per frame.
    mfccs = np.mean(librosa.feature.mfcc(y=X,
                                         sr=np.array(sample_rate),
                                         n_mfcc=13),
                    axis=0)
    # NOTE(review): appending rows with .loc in a loop is O(n^2); collecting
    # into a list and building the DataFrame once would be faster.
    df.loc[cnt] = [mfccs]
    # random shifting (omit for now)
    # Stretch
    # pitch (omit for now)
    # dyn change
    # noise-augmented copy of the same clip
    aug = noise(X)
    aug = np.mean(librosa.feature.mfcc(y=aug,
                                       sr=np.array(sample_rate),
                                       n_mfcc=13),
                  axis=0)
    df_noise.loc[cnt] = [aug]
    # speed-and-pitch-augmented copy of the same clip
    aug = speedNpitch(X)
    aug = np.mean(librosa.feature.mfcc(y=aug,
                                       sr=np.array(sample_rate),
                                       n_mfcc=13),
                  axis=0)
    df_speedpitch.loc[cnt] = [aug]
    cnt += 1
df.head()
# combine each feature list with the meta-data (path/labels/source) columns
df = pd.concat([ref, pd.DataFrame(df['feature'].values.tolist())], axis=1)
df_noise = pd.concat([ref, pd.DataFrame(df_noise['feature'].values.tolist())], axis=1)
df_speedpitch = pd.concat([ref, pd.DataFrame(df_speedpitch['feature'].values.tolist())], axis=1)
print(df.shape, df_noise.shape, df_speedpitch.shape)
# stack original + both augmented sets row-wise; pad missing feature columns with zeros
df = pd.concat([df, df_noise, df_speedpitch], axis=0, sort=False)
df = df.fillna(0)
del df_noise, df_speedpitch
df.head()
# Split between train and test (features only; drop the meta-data columns)
X_train, X_test, y_train, y_test = train_test_split(df.drop(['path', 'labels', 'source'], axis=1)
                                                    , df.labels
                                                    , test_size=0.25
                                                    , shuffle=True
                                                    , random_state=42
                                                    )
# Lets see how the data present itself before normalisation
X_train[150:160]
# Lets do data normalization: z-score using the *training* mean/std only,
# applied to both splits (avoids test-set leakage)
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std
# Check the dataset now
X_train[150:160]
# A few preparation steps to get it into the correct format for Keras
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
# one hot encode the target
lb = LabelEncoder()
# Bug fix: fit the encoder on the training labels only, then re-use the same
# mapping for the test labels — re-fitting on y_test could produce a different
# class ordering if the test split were missing a class.
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
y_test = np_utils.to_categorical(lb.transform(y_test))
print(X_train.shape)
print(lb.classes_)
# Pickle the fitted LabelEncoder for future use (decoding predictions later);
# the context manager guarantees the file is closed even on error.
filename = 'labels'
with open(filename, 'wb') as outfile:
    pickle.dump(lb, outfile)
# Conv1D expects (samples, timesteps, channels): add a trailing channel axis
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)
X_train.shape
# New model: stacked 1D CNN over the mean-MFCC feature vectors
model = Sequential()
model.add(Conv1D(256, 8, padding='same', input_shape=(X_train.shape[1], 1)))  # X_train.shape[1] = No. of Columns
model.add(Activation('relu'))
model.add(Conv1D(256, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(14))  # Target class number (2 genders x 7 emotions)
model.add(Activation('softmax'))
# NOTE(review): `lr`/`decay` are legacy kwargs (newer Keras uses
# learning_rate and LR schedules) — kept as-is for this codebase's version.
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model_history = model.fit(X_train, y_train, batch_size=16, epochs=150, validation_data=(X_test, y_test), verbose=2)
# Plot training vs. validation loss over the epochs
for series in ('loss', 'val_loss'):
    plt.plot(model_history.history[series])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# Save model and weights
model_name = 'Emotion_Model_aug.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')
# exist_ok=True avoids the race between an isdir() check and makedirs()
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Save model and weights at %s ' % model_path)
# Save the model architecture to disk as JSON
model_json = model.to_json()
with open("model_json_aug.json", "w") as json_file:
    json_file.write(model_json)
# loading json and model architecture; the context manager replaces the
# manual open()/close() pair and closes the file even on error
with open('model_json_aug.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Model_aug.h5")
print("Loaded model from disk")
# Keras optimiser — the restored model must be recompiled before evaluation
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
preds = loaded_model.predict(X_test,
batch_size=16,
verbose=1)
preds=preds.argmax(axis=1)
preds
# predictions
preds = preds.astype(int).flatten()
preds = (lb.inverse_transform((preds)))
preds = pd.DataFrame({'predictedvalues': preds})
# Actual labels
actual=y_test.argmax(axis=1)
actual = actual.astype(int).flatten()
actual = (lb.inverse_transform((actual)))
actual = pd.DataFrame({'actualvalues': actual})
# Lets combined both of them into a single dataframe
finaldf = actual.join(preds)
finaldf[170:180]
# Write out the predictions to disk
finaldf.to_csv('Predictions.csv', index=False)
finaldf.groupby('predictedvalues').count()
# Get the predictions file
finaldf = pd.read_csv("Predictions.csv")
classes = finaldf.actualvalues.unique()
classes.sort()
# Confusion matrix
c = confusion_matrix(finaldf.actualvalues, finaldf.predictedvalues)
print(accuracy_score(finaldf.actualvalues, finaldf.predictedvalues))
print_confusion_matrix(c, class_names = classes)
# Classification report
classes = finaldf.actualvalues.unique()
classes.sort()
print(classification_report(finaldf.actualvalues, finaldf.predictedvalues, target_names=classes))
# Collapse the 14 gender+emotion labels down to just the gender
gender_map = {'female_angry': 'female', 'female_disgust': 'female',
              'female_fear': 'female', 'female_happy': 'female',
              'female_sad': 'female', 'female_surprise': 'female',
              'female_neutral': 'female', 'male_angry': 'male',
              'male_fear': 'male', 'male_happy': 'male',
              'male_sad': 'male', 'male_surprise': 'male',
              'male_neutral': 'male', 'male_disgust': 'male'}
# Bug fix: take a copy — the original `modidf = finaldf` aliased the same
# DataFrame, so the remapping silently mutated finaldf as well.
modidf = finaldf.copy()
modidf['actualvalues'] = modidf.actualvalues.replace(gender_map)
modidf['predictedvalues'] = modidf.predictedvalues.replace(gender_map)
classes = modidf.actualvalues.unique()
classes.sort()
# Confusion matrix for the 2-class (male/female) view
c = confusion_matrix(modidf.actualvalues, modidf.predictedvalues)
print(accuracy_score(modidf.actualvalues, modidf.predictedvalues))
print_confusion_matrix(c, class_names=classes)
# Classification report
classes = modidf.actualvalues.unique()
classes.sort()
print(classification_report(modidf.actualvalues, modidf.predictedvalues, target_names=classes))
# Re-load the raw predictions and collapse labels to the 7 base emotions,
# discarding the gender prefix
modidf = pd.read_csv("Predictions.csv")
emotion_map = {}
for gender in ('female', 'male'):
    for emo in ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral'):
        emotion_map['%s_%s' % (gender, emo)] = emo
modidf['actualvalues'] = modidf.actualvalues.replace(emotion_map)
modidf['predictedvalues'] = modidf.predictedvalues.replace(emotion_map)
classes = modidf.actualvalues.unique()
classes.sort()
# Confusion matrix for the 7-emotion view
c = confusion_matrix(modidf.actualvalues, modidf.predictedvalues)
print(accuracy_score(modidf.actualvalues, modidf.predictedvalues))
print_confusion_matrix(c, class_names=classes)
# Classification report
classes = modidf.actualvalues.unique()
classes.sort()
print(classification_report(modidf.actualvalues, modidf.predictedvalues, target_names=classes))
# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, Dropout,
GlobalMaxPool2D, MaxPool2D, concatenate, Activation, Input, Dense)
# sklearn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Other
from tqdm import tqdm, tqdm_pandas
import scipy
from scipy.stats import skew
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob
import os
import sys
import IPython.display as ipd # To play sound in the notebook
import warnings
# ignore warnings (e.g. deprecation notices from librosa / keras)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
'''
1. Data Augmentation method
'''
def speedNpitch(data):
    """
    Speed and Pitch Tuning via simple linear resampling.

    Resamples the signal at a random rate > 1 (speeding it up), then
    zero-pads back to the original length.

    Bug fix: the original mutated the caller's array in place (`data *= 0`);
    this version leaves the input untouched and returns a new float array.
    """
    # you can change low and high here
    length_change = np.random.uniform(low=0.8, high=1)
    speed_fac = 1.2 / length_change  # try changing 1.0 to 2.0 ... =D
    tmp = np.interp(np.arange(0, len(data), speed_fac),
                    np.arange(0, len(data)), data)
    minlen = min(data.shape[0], tmp.shape[0])
    out = np.zeros(data.shape[0], dtype=float)
    out[0:minlen] = tmp[0:minlen]
    return out
'''
2. Extracting the MFCC feature as an image (Matrix format).
'''
def prepare_data(df, n, aug, mfcc):
    """
    Extract a fixed-size 2D feature (MFCC or log-melspectrogram) per clip.

    Parameters
    ----------
    df : DataFrame with a `path` column of audio file paths.
    n : number of feature bands (rows) per clip, e.g. n_mfcc.
    aug : 1 to apply speedNpitch augmentation, anything else to skip it.
    mfcc : 1 for MFCC features, anything else for log-melspectrogram.

    Relies on the module-level globals `sampling_rate`, `audio_duration`
    and, depending on `mfcc`, `n_mfcc` / `n_melspec`.

    Returns
    -------
    numpy.ndarray of shape (len(df), n, 216, 1).
    """
    X = np.empty(shape=(df.shape[0], n, 216, 1))
    # Bug fix: cast to int — sampling_rate * audio_duration (44100 * 2.5) is a
    # float, and floats cannot be used in the slice/pad arithmetic below.
    input_length = int(sampling_rate * audio_duration)
    cnt = 0
    for fname in tqdm(df.path):
        file_path = fname
        data, _ = librosa.load(file_path, sr=sampling_rate,
                               res_type="kaiser_fast",
                               duration=2.5,
                               offset=0.5)
        # Random offset / Padding so every clip is exactly input_length samples
        if len(data) > input_length:
            max_offset = len(data) - input_length
            offset = np.random.randint(max_offset)
            data = data[offset:(input_length + offset)]
        else:
            if input_length > len(data):
                max_offset = input_length - len(data)
                offset = np.random.randint(max_offset)
            else:
                offset = 0
            data = np.pad(data, (offset, input_length - len(data) - offset), "constant")
        # Augmentation?
        if aug == 1:
            data = speedNpitch(data)
        # which feature?
        if mfcc == 1:
            # MFCC extraction (keyword args for newer librosa versions)
            MFCC = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n_mfcc)
            X[cnt,] = np.expand_dims(MFCC, axis=-1)
        else:
            # Log-melspectrogram
            melspec = librosa.feature.melspectrogram(y=data, n_mels=n_melspec)
            logspec = librosa.amplitude_to_db(melspec)
            X[cnt,] = np.expand_dims(logspec, axis=-1)
        cnt += 1
    return X
'''
3. Confusion matrix plot
'''
def print_confusion_matrix(confusion_matrix, class_names, figsize=(10, 7), fontsize=14):
    '''Prints a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the ouputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure
    '''
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )
    fig = plt.figure(figsize=figsize)
    try:
        # fmt="d" requires integer cell values
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Bug fix: the docstring promises the figure, but it was never returned.
    return fig
'''
# 4. Create the 2D CNN model
'''
def get_2d_conv_model(n):
    '''Create a standard deep 2D convolutional neural network'''
    nclass = 14
    inp = Input(shape=(n, 216, 1))  # 2D matrix of n MFCC bands by 216 audio length.
    # Four identical conv -> batch-norm -> relu -> max-pool -> dropout stages
    x = inp
    for _ in range(4):
        x = Convolution2D(32, (4, 10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)
        x = Dropout(rate=0.2)(x)
    # Classification head
    x = Flatten()(x)
    x = Dense(64)(x)
    x = Dropout(rate=0.2)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(rate=0.2)(x)
    out = Dense(nclass, activation=softmax)(x)
    model = models.Model(inputs=inp, outputs=out)
    opt = optimizers.Adam(0.001)
    model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
    return model
'''
# 5. Other functions
'''
class get_results:
    '''
    We're going to create a class (blueprint template) for generating the results based on the various model approaches.
    So instead of repeating the functions each time, we assign the results into on object with its associated variables
    depending on each combination:
        1) MFCC with no augmentation
        2) MFCC with augmentation
        3) Logmelspec with no augmentation
        4) Logmelspec with augmentation

    NOTE(review): several methods depend on module-level globals (`lb`, and in
    create_results `X_test`/`y_test`) rather than the stored attributes.
    '''
    def __init__(self, model_history, model, X_test, y_test, labels):
        # Store everything needed to report on one trained model run
        self.model_history = model_history
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.labels = labels

    def create_plot(self, model_history):
        '''Check the logloss of both train and validation, make sure they are close and have plateau'''
        plt.plot(model_history.history['loss'])
        plt.plot(model_history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

    def create_results(self, model):
        '''predict on test set and get accuracy results'''
        # NOTE(review): evaluates against the module-level globals X_test and
        # y_test, not self.X_test / self.y_test — confirm this is intended.
        opt = optimizers.Adam(0.001)
        model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
        score = model.evaluate(X_test, y_test, verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

    def confusion_results(self, X_test, y_test, labels, model):
        '''plot confusion matrix results'''
        preds = model.predict(X_test,
                              batch_size=16,
                              verbose=2)
        # Decode predicted/actual label strings via the module-level
        # LabelEncoder `lb` fitted during training.
        preds = preds.argmax(axis=1)
        preds = preds.astype(int).flatten()
        preds = (lb.inverse_transform((preds)))
        actual = y_test.argmax(axis=1)
        actual = actual.astype(int).flatten()
        actual = (lb.inverse_transform((actual)))
        classes = labels
        classes.sort()
        c = confusion_matrix(actual, preds)
        print_confusion_matrix(c, class_names=classes)

    def accuracy_results_gender(self, X_test, y_test, labels, model):
        '''Print out the accuracy score and confusion matrix heat map of the Gender classification results'''
        preds = model.predict(X_test,
                              batch_size=16,
                              verbose=2)
        preds = preds.argmax(axis=1)
        preds = preds.astype(int).flatten()
        preds = (lb.inverse_transform((preds)))
        actual = y_test.argmax(axis=1)
        actual = actual.astype(int).flatten()
        actual = (lb.inverse_transform((actual)))
        # print(accuracy_score(actual, preds))
        # Collapse the 14 gender+emotion labels down to 'female' / 'male'
        actual = pd.DataFrame(actual).replace({'female_angry': 'female'
                                               , 'female_disgust': 'female'
                                               , 'female_fear': 'female'
                                               , 'female_happy': 'female'
                                               , 'female_sad': 'female'
                                               , 'female_surprise': 'female'
                                               , 'female_neutral': 'female'
                                               , 'male_angry': 'male'
                                               , 'male_fear': 'male'
                                               , 'male_happy': 'male'
                                               , 'male_sad': 'male'
                                               , 'male_surprise': 'male'
                                               , 'male_neutral': 'male'
                                               , 'male_disgust': 'male'
                                               })
        preds = pd.DataFrame(preds).replace({'female_angry': 'female'
                                             , 'female_disgust': 'female'
                                             , 'female_fear': 'female'
                                             , 'female_happy': 'female'
                                             , 'female_sad': 'female'
                                             , 'female_surprise': 'female'
                                             , 'female_neutral': 'female'
                                             , 'male_angry': 'male'
                                             , 'male_fear': 'male'
                                             , 'male_happy': 'male'
                                             , 'male_sad': 'male'
                                             , 'male_surprise': 'male'
                                             , 'male_neutral': 'male'
                                             , 'male_disgust': 'male'
                                             })
        classes = actual.loc[:, 0].unique()
        classes.sort()
        c = confusion_matrix(actual, preds)
        print(accuracy_score(actual, preds))
        print_confusion_matrix(c, class_names=classes)
# Pick up the meta-data again and build the 2D MFCC features (no augmentation)
ref = pd.read_csv("/home/vm/kaggle/input/data-path/Data_path.csv")
ref.head()
# Module-level globals consumed by prepare_data()
sampling_rate = 44100
audio_duration = 2.5
n_mfcc = 30
mfcc = prepare_data(ref, n=n_mfcc, aug=0, mfcc=1)
# Split between train and test
X_train, X_test, y_train, y_test = train_test_split(mfcc,
                                                    ref.labels,
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42)
# one hot encode the target
lb = LabelEncoder()
# Bug fix: fit on the training labels only and re-use that mapping for the
# test labels — calling fit_transform on y_test could yield a different class
# ordering if the test split were missing a class.
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
y_test = np_utils.to_categorical(lb.transform(y_test))
# Normalization over frequency axis *** I changed this bit to get better accuracy with data augmentation ***
def fit_X_scaler(X_train):
    """Fit a StandardScaler incrementally over every clip and return it.

    Each clip of shape (n, 216, 1) is squeezed to (n, 216) and fed to
    partial_fit, so the scaler accumulates statistics across all clips.
    """
    sc = StandardScaler()
    # enumerate() was unnecessary here — the index was discarded
    for clips in X_train:
        sc.partial_fit(np.squeeze(clips))
    return sc
def get_X_scaled(X_train, scaler=None):
    """Normalise every clip of X_train in place with a fitted scaler.

    Parameters
    ----------
    X_train : ndarray of shape (clips, n, 216, 1); modified in place.
    scaler : fitted StandardScaler-like object, or None to leave the
        data unchanged.

    Returns the same (possibly normalised) array.
    """
    # Bug fix: the original allocated an unused `X_train_new` buffer and, when
    # scaler was None, performed a no-op identity assignment — both removed.
    if scaler is not None:
        for indx, clips in enumerate(X_train):
            X_train[indx, :, :, 0] = scaler.transform(np.squeeze(clips))
    return X_train
# Fit the scaler on the training clips, then normalise both splits with it
sc = fit_X_scaler(X_train)
X_train = get_X_scaled(X_train, sc)
X_test = get_X_scaled(X_test, sc)
# Build CNN model
model = get_2d_conv_model(n=n_mfcc)
model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                          batch_size=16, verbose=2, epochs=50)
# Report: loss curves, test accuracy, and the 14-class confusion matrix
results = get_results(model_history, model, X_test, y_test, ref.labels.unique())
results.create_plot(model_history)
results.create_results(model)
results.confusion_results(X_test, y_test, ref.labels.unique(), model)